%matplotlib inline
from IPython.display import display, display_markdown
import subprocess as sp
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
import arviz as az
import bambi
import copy
import warnings
import matplotlib.pyplot as plt
# default figure size (inches) and resolution for all plots in this notebook
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 300
from scipy.stats import pearsonr, spearmanr
from itertools import chain
from collections import Counter
from subs2vec.utensils import log_timer
from subs2vec.vecs import Vectors
from subs2vec.neighbors import compute_nn
def display_md(md, **kwargs):
    """Render a markdown string as rich notebook output (raw markdown passthrough)."""
    return display_markdown(md, raw=True, **kwargs)
def convert_notebook(title, output='html'):
    """Convert `{title}.ipynb` to another format via `jupyter nbconvert` and report the outcome."""
    command = f'jupyter nbconvert {title}.ipynb --to {output} --output {title}.{output}'
    result = sp.run(command.split(' '))
    if result.returncode != 0:
        display_md(f'Error: encountered problem converting Jupyter notebook `{title}`')
    else:
        display_md(f'Jupyter notebook `{title}` converted successfully.')
def download(fname):
    """Download a file with wget and report success or failure in the notebook.

    Parameters
    ----------
    fname : str
        URL (or other wget-compatible specifier) of the file to download.
    """
    dl = sp.run(f'wget {fname}'.split(' '))
    if dl.returncode == 0:
        # fixed typo in the user-facing message: 'succesful' -> 'successful'
        display_md(f'Download of `{fname}` successful.')
    else:
        display_md(f'Download of `{fname}` failed.')
@log_timer
def filter_vecs(vecs, filter_words):
    """Return a deep copy of `vecs` restricted to the words in `filter_words`.

    Parameters
    ----------
    vecs : Vectors
        Word vectors object with parallel `words` and `vectors` arrays and a
        count attribute `n`.
    filter_words : array-like of str
        Words to keep.

    Returns
    -------
    Vectors
        Copy containing only the requested words, with `n` updated to match.
    """
    filtered_vecs = copy.deepcopy(vecs)
    # compute the membership mask once; rows and words must be filtered with
    # the same mask anyway (original computed np.isin twice)
    mask = np.isin(filtered_vecs.words, filter_words)
    filtered_vecs.vectors = filtered_vecs.vectors[mask]
    filtered_vecs.words = filtered_vecs.words[mask]
    filtered_vecs.n = len(filtered_vecs.words)
    display_md(f'Filtered {vecs.n} vectors, {filtered_vecs.n} remaining.')
    return filtered_vecs
def norm(x):
    """Scale vector `x` to unit Euclidean (L2) length."""
    length = np.linalg.norm(x, 2)
    return x / length
# plot styling and pandas settings
sns.set(style='whitegrid')
# silence chained-assignment (SettingWithCopy) warnings
pd.options.mode.chained_assignment = None
# ratings from the original experiment (blind and sighted participants)
df = pd.read_csv('color_semantics_ratings.tsv', sep='\t')
display(df)
| participant | white | red | orange | yellow | green | blue | purple | brown | black | dimension | group | pp_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 7 | 7 | 5 | 1 | 1 | 1 | 3 | 7 | cold-hot | sighted | sighted_1 |
| 1 | 1 | 7 | 1 | 4 | 2 | 3 | 3 | 6 | 6 | 7 | ripe-unripe | sighted | sighted_1 |
| 2 | 1 | 1 | 5 | 6 | 7 | 4 | 2 | 3 | 7 | 6 | new-old | sighted | sighted_1 |
| 3 | 1 | 1 | 7 | 2 | 1 | 4 | 2 | 3 | 5 | 7 | submissive-aggressive | sighted | sighted_1 |
| 4 | 1 | 1 | 7 | 6 | 1 | 2 | 2 | 5 | 3 | 5 | selfless-jealous | sighted | sighted_1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 539 | 12 | 1 | 2 | 1 | 2 | 2 | 3 | 2 | 3 | 4 | soft-hard | blind | blind_12 |
| 540 | 12 | 4 | 3 | 3 | 4 | 2 | 2 | 3 | 2 | 5 | light-heavy | blind | blind_12 |
| 541 | 12 | 2 | 4 | 2 | 1 | 2 | 1 | 2 | 3 | 2 | relaxed-tense | blind | blind_12 |
| 542 | 12 | 4 | 2 | 1 | 1 | 1 | 3 | 2 | 3 | 5 | alive-dead | blind | blind_12 |
| 543 | 12 | 6 | 7 | 4 | 3 | 4 | 4 | 1 | 2 | 5 | fast-slow | blind | blind_12 |
544 rows × 13 columns
# these are the colors in the data
colors = ['white', 'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'black']
# melt to long format: one row per (participant, dimension, color) rating
df_orig = df.melt(
    id_vars=['group', 'dimension', 'pp_id'],
    value_vars=colors,
    var_name='color',
    value_name='rating',
)
# pull out dimension words ('cold-hot' -> ['cold', 'hot']), flattened into one list
dimension_labels = df_orig['dimension'].unique()
dimension_pairs = [pair.split('-') for pair in dimension_labels]
dimensions = list(chain(*dimension_pairs))
# add experiment and self vs. other variables for when we add the replication experiment later
df_orig['experiment'] = 'original'
df_orig['self_vs_other'] = 'self'
display(df_orig)
| group | dimension | pp_id | color | rating | experiment | self_vs_other | |
|---|---|---|---|---|---|---|---|
| 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self |
| 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self |
| 2 | sighted | new-old | sighted_1 | white | 1 | original | self |
| 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self |
| 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 4891 | blind | soft-hard | blind_12 | black | 4 | original | self |
| 4892 | blind | light-heavy | blind_12 | black | 5 | original | self |
| 4893 | blind | relaxed-tense | blind_12 | black | 2 | original | self |
| 4894 | blind | alive-dead | blind_12 | black | 5 | original | self |
| 4895 | blind | fast-slow | blind_12 | black | 5 | original | self |
4896 rows × 7 columns
# ratings from the first replication experiment (online subject pool)
df_rep = pd.read_csv('replication_subj_pool.csv')
# little bit of data munging, drop test participant and catch trials
df_rep = df_rep[(df_rep['pp_id'] != 3) & (df_rep['question_type'] != 'catch')]
df_rep = df_rep.drop(columns=['question_type', 'prompt_pre_1'])
# melt to long format: 'value' is a participant's own rating, 'others_choice'
# presumably their estimate of other people's rating (relabeled 'other' below)
df_rep = df_rep.melt(
    id_vars=['dimension', 'color', 'pp_id'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)
# more data munging: prefix participant ids, relabel self/other, tag group and experiment
df_rep['pp_id'] = 'sighted_' + df_rep['pp_id'].astype(str)
df_rep['self_vs_other'] = df_rep['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_rep['group'] = 'sighted'
df_rep['experiment'] = 'replication_1'
# there is a weird typo in one of the dimensions (?), so let's correct that here as well
df_rep['dimension'] = df_rep['dimension'].replace({'like-dis...like': 'like-dislike'})
display(df_rep)
| dimension | color | pp_id | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|
| 0 | clean-dirty | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 1 | soft-hard | yellow | sighted_69819 | self | 2 | sighted | replication_1 |
| 2 | ripe-unripe | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| 3 | selfless-jealous | yellow | sighted_69819 | self | 5 | sighted | replication_1 |
| 4 | high-low | yellow | sighted_69819 | self | 1 | sighted | replication_1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9567 | like-dislike | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9568 | new-old | orange | sighted_69785 | other | 4 | sighted | replication_1 |
| 9569 | clean-dirty | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9570 | relaxed-tense | orange | sighted_69785 | other | 5 | sighted | replication_1 |
| 9571 | active-passive | orange | sighted_69785 | other | 3 | sighted | replication_1 |
9572 rows × 7 columns
# second replication: semantic differentials plus reading-habit measures
df_read = pd.read_csv('semantic_diff_with_readmeasure.csv').drop(columns=['Unnamed: 0', 'X'])
# per-participant means (sanity check of the reading measures)
display(df_read.groupby('subj_id').mean())
| value | others_choice | art | fiction | nonfiction | Q9_1 | Q9_2 | Q9_3 | Q9_4 | Q9_5 | ... | Q9_17 | Q9_18 | Q9_19 | Q9_20 | Q9_21 | composite_read | upper_art | upper_fiction | upper_nonfiction | upper_read_motivation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| subj_id | |||||||||||||||||||||
| 67590 | 3.518519 | 3.450617 | 18.0 | 1.0 | 3.0 | 0.0 | 2.0 | 0.0 | 1.0 | 1.0 | ... | -1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 0.722222 | 1.0 | 1.0 | 1.0 | 1.0 |
| 67597 | 3.895062 | 3.820988 | 10.0 | 0.0 | 0.0 | 0.0 | -1.0 | -1.0 | -2.0 | -1.0 | ... | -1.0 | 1.0 | 1.0 | 1.0 | 0.0 | -0.555556 | 1.0 | 0.0 | 0.0 | 0.0 |
| 67648 | 4.129630 | 3.913580 | 9.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 1.0 | 1.0 |
| 67653 | 3.962963 | 3.925926 | 10.0 | 4.0 | 3.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | -1.0 | 0.0 | 0.0 | -0.222222 | 1.0 | 1.0 | 1.0 | 0.0 |
| 67656 | 3.469136 | 3.654321 | 7.0 | 0.0 | 0.0 | 1.0 | -2.0 | -2.0 | -2.0 | -2.0 | ... | 0.0 | 2.0 | 0.0 | 0.0 | -2.0 | -1.333333 | 1.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 69841 | 3.432099 | 3.518519 | 6.0 | 1.0 | 3.0 | 1.0 | 0.0 | -2.0 | -2.0 | -1.0 | ... | 1.0 | -1.0 | 1.0 | 2.0 | 0.0 | -0.722222 | 0.0 | 1.0 | 1.0 | 0.0 |
| 69860 | 3.716049 | 3.796296 | 21.0 | 0.0 | 1.0 | -1.0 | 1.0 | -1.0 | 1.0 | 1.0 | ... | 0.0 | -2.0 | 1.0 | 1.0 | 1.0 | 0.222222 | 1.0 | 0.0 | 1.0 | 1.0 |
| 69889 | 3.932099 | 4.000000 | 6.0 | 0.0 | 1.0 | 1.0 | 1.0 | -2.0 | -2.0 | -1.0 | ... | 0.0 | -1.0 | 0.0 | 0.0 | -1.0 | -0.888889 | 0.0 | 0.0 | 1.0 | 0.0 |
| 69914 | 4.037037 | 3.907407 | 26.0 | 2.0 | 4.0 | 2.0 | 1.0 | -1.0 | 2.0 | 2.0 | ... | -2.0 | 1.0 | 1.0 | 1.0 | -1.0 | 0.277778 | 1.0 | 1.0 | 1.0 | 1.0 |
| 70147 | 3.197531 | 3.450617 | 12.0 | 1.0 | 0.0 | 1.0 | 1.0 | -1.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | -1.0 | 1.0 | 0.111111 | 1.0 | 1.0 | 0.0 | 1.0 |
88 rows × 31 columns
def _questionnaire_mean(df, items, reverse_coded=()):
    """Row-wise mean of the given questionnaire items, sign-flipping reverse-coded ones.

    Uses vectorized column sums instead of row-wise `DataFrame.apply(axis=1)`;
    `skipna=False` keeps NaN propagation identical to element-wise addition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the item columns.
    items : sequence of str
        All item columns entering the composite (including reverse-coded ones).
    reverse_coded : sequence of str
        Subset of `items` whose sign should be flipped.

    Returns
    -------
    pandas.Series
        The composite score (sum of signed items divided by the item count).
    """
    total = df[list(items)].sum(axis=1, skipna=False)
    if reverse_coded:
        # the reverse-coded items entered the sum positively;
        # subtracting them twice flips their sign
        total = total - 2 * df[list(reverse_coded)].sum(axis=1, skipna=False)
    return total / len(items)


# overall reading motivation: all 21 items, Q9_14 and Q9_17 reverse-coded
df_read['reading_motivation'] = _questionnaire_mean(
    df_read, [f'Q9_{i}' for i in range(1, 22)], reverse_coded=['Q9_14', 'Q9_17'])
# subscale: reading as part of one's identity
df_read['reading_part_of_self'] = _questionnaire_mean(
    df_read, ['Q9_2', 'Q9_3', 'Q9_4', 'Q9_5', 'Q9_6', 'Q9_9', 'Q9_10', 'Q9_11'])
# subscale: reading efficacy (Q9_14 and Q9_17 reverse-coded)
df_read['reading_efficacy'] = _questionnaire_mean(
    df_read, ['Q9_1', 'Q9_14', 'Q9_16', 'Q9_17', 'Q9_19', 'Q9_20'],
    reverse_coded=['Q9_14', 'Q9_17'])
# subscale: reading for recognition
df_read['reading_recognition'] = _questionnaire_mean(
    df_read, ['Q9_12', 'Q9_13', 'Q9_15'])
# subscale: reading to experience other realms
df_read['reading_other_realms'] = _questionnaire_mean(
    df_read, ['Q9_7', 'Q9_8', 'Q9_18', 'Q9_21'])
# rename participant id column to match earlier datasets
df_read = df_read.rename(columns={'subj_id': 'pp_id'})
# melt to long format, carrying the reading measures along as id variables
df_read = df_read.melt(
    id_vars=['dimension', 'color', 'pp_id', 'art', 'fiction', 'nonfiction', 'reading_motivation',
             'reading_part_of_self', 'reading_efficacy', 'reading_recognition', 'reading_other_realms'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)
# more data munging: prefix participant ids, relabel self/other, tag group and experiment
df_read['pp_id'] = 'sighted_' + df_read['pp_id'].astype(str)
df_read['self_vs_other'] = df_read['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_read['group'] = 'sighted'
df_read['experiment'] = 'replication_2'
display(df_read)
| dimension | color | pp_id | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | self_vs_other | rating | group | experiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 4 | sighted | replication_2 |
| 1 | ripe-unripe | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 7 | sighted | replication_2 |
| 2 | new-old | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 6 | sighted | replication_2 |
| 3 | submissive-aggressive | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 2 | sighted | replication_2 |
| 4 | selfless-jealous | brown | sighted_69212 | 3.0 | 0.0 | 1.0 | 1.619048 | 2.000 | 0.666667 | 2.0 | 2.00 | self | 5 | sighted | replication_2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28507 | light-heavy | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28508 | relaxed-tense | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 5 | sighted | replication_2 |
| 28509 | alive-dead | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 6 | sighted | replication_2 |
| 28510 | fast-slow | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 3 | sighted | replication_2 |
| 28511 | high-low | red | sighted_68129 | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 | other | 2 | sighted | replication_2 |
28512 rows × 15 columns
# summary statistics of the reading measures and ratings
df_read.describe()
| art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | rating | |
|---|---|---|---|---|---|---|---|---|---|
| count | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 27864.000000 | 28512.000000 |
| mean | 7.616279 | 0.593023 | 0.755814 | -0.107973 | -0.280523 | 0.203488 | -0.616279 | 0.151163 | 3.693147 |
| std | 6.612596 | 0.854251 | 0.987569 | 0.646210 | 0.879559 | 0.646210 | 0.910077 | 0.759909 | 1.424941 |
| min | -5.000000 | 0.000000 | 0.000000 | -1.619048 | -2.000000 | -1.000000 | -2.000000 | -2.000000 | 1.000000 |
| 25% | 3.000000 | 0.000000 | 0.000000 | -0.571429 | -1.000000 | -0.333333 | -1.333333 | -0.500000 | 3.000000 |
| 50% | 6.000000 | 0.000000 | 0.000000 | -0.119048 | -0.375000 | 0.166667 | -0.666667 | 0.250000 | 4.000000 |
| 75% | 10.000000 | 1.000000 | 1.000000 | 0.285714 | 0.250000 | 0.666667 | 0.000000 | 0.500000 | 5.000000 |
| max | 26.000000 | 4.000000 | 4.000000 | 1.619048 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 7.000000 |
# pairwise correlations between the reading measures
corrs = df_read[['art', 'fiction', 'nonfiction', 'reading_motivation', 'reading_part_of_self',
                 'reading_efficacy', 'reading_recognition', 'reading_other_realms']].corr().round(2)
# mask the upper triangle (incl. diagonal) so the heatmap shows each pair once
mask = np.zeros_like(corrs)
mask[np.triu_indices_from(mask)] = True
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True, mask=mask)
g.set_yticklabels(g.get_yticklabels(), rotation=0);
# distribution of the 'art' (author recognition test) scores
g = sns.histplot(x='art', data=df_read)
# load the top 200k 300-dimensional English word vectors, length-normalized
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)
# restrict to the color words and dimension pole words used in the experiments
vecs = filter_vecs(vecs, np.array(colors + dimensions))
vecs_dict = vecs.as_dict()
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))
# one renormalized difference vector per dimension pair (e.g. cold minus hot)
dimension_pair_vecs = np.vstack([norm(vecs_dict[pair[0]] - vecs_dict[pair[1]]) for pair in dimension_pairs])
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.203 seconds
Filtered 200000 vectors, 43 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.745 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.000 seconds
Filtered 43 vectors, 9 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.003 seconds
Filtered 43 vectors, 34 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.003 seconds
# nearest color neighbors of each single dimension pole word
dimension_neighbors = compute_nn(color_vecs, dimension_vecs.vectors, dimension_vecs.words, num_neighbors=9, whole_matrix=True)
# drop the 'neighbor -1' .. 'neighbor -9' columns and rename the target column
dimension_neighbors = (
    dimension_neighbors
    .drop(columns=[f'neighbor -{k}' for k in range(1, 10)])
    .rename(columns={'target': 'dimension'})
)
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x7fb0731c9cb0> ran in 0.005 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | like | white | black | brown | yellow | orange | purple | blue | green | red |
| 1 | old | white | black | brown | yellow | orange | blue | red | green | purple |
| 2 | new | white | green | yellow | black | red | blue | purple | brown | orange |
| 3 | light | yellow | red | green | orange | blue | purple | white | brown | black |
| 4 | hard | brown | white | black | red | orange | purple | yellow | green | blue |
| 5 | dead | black | brown | white | red | green | yellow | purple | blue | orange |
| 6 | cold | blue | white | green | black | red | brown | purple | yellow | orange |
| 7 | happy | white | yellow | orange | red | brown | blue | green | purple | black |
| 8 | hot | red | yellow | black | purple | white | orange | blue | green | brown |
| 9 | heavy | black | brown | green | red | purple | yellow | blue | white | orange |
| 10 | fast | red | purple | white | blue | yellow | green | black | brown | orange |
| 11 | soft | brown | yellow | white | green | orange | blue | purple | red | black |
| 12 | clean | white | blue | black | yellow | brown | green | red | orange | purple |
| 13 | slow | red | yellow | brown | black | green | purple | white | orange | blue |
| 14 | angry | red | orange | purple | blue | white | yellow | black | brown | green |
| 15 | alive | green | black | orange | red | blue | yellow | brown | purple | white |
| 16 | sad | brown | purple | blue | white | green | black | red | yellow | orange |
| 17 | fresh | white | green | blue | yellow | brown | red | black | purple | orange |
| 18 | calm | blue | green | brown | white | purple | yellow | black | red | orange |
| 19 | dirty | white | brown | yellow | blue | black | red | green | orange | purple |
| 20 | dull | brown | green | yellow | blue | orange | red | purple | black | white |
| 21 | relaxed | brown | white | green | yellow | red | blue | orange | black | purple |
| 22 | jealous | red | purple | orange | black | green | blue | yellow | brown | white |
| 23 | tense | white | black | red | yellow | blue | brown | purple | green | orange |
| 24 | exciting | green | orange | black | brown | purple | blue | red | white | yellow |
| 25 | active | orange | black | green | yellow | red | white | brown | blue | purple |
| 26 | ripe | purple | orange | brown | green | yellow | blue | red | white | black |
| 27 | aggressive | black | brown | orange | white | yellow | red | purple | green | blue |
| 28 | stale | brown | white | yellow | black | red | orange | green | blue | purple |
| 29 | dislike | brown | green | black | blue | orange | purple | white | yellow | red |
| 30 | passive | white | black | brown | yellow | red | green | purple | orange | blue |
| 31 | selfless | blue | purple | white | black | green | red | orange | yellow | brown |
| 32 | submissive | white | brown | yellow | purple | black | blue | orange | green | red |
| 33 | unripe | orange | purple | yellow | brown | green | blue | red | white | black |
# same nearest-neighbor analysis, but using the dimension difference vectors
dimension_neighbors = compute_nn(color_vecs, dimension_pair_vecs, np.array(dimension_labels), num_neighbors=9, whole_matrix=True)
# drop the 'neighbor -1' .. 'neighbor -9' columns and rename the target column
dimension_neighbors = (
    dimension_neighbors
    .drop(columns=[f'neighbor -{k}' for k in range(1, 10)])
    .rename(columns={'target': 'dimension'})
)
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.000 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x7fb0731c9cb0> ran in 0.006 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | blue | white | green | brown | black | purple | red | yellow | orange |
| 1 | ripe-unripe | green | white | brown | black | blue | red | purple | yellow | orange |
| 2 | new-old | green | purple | yellow | red | blue | black | orange | white | brown |
| 3 | submissive-aggressive | white | blue | purple | yellow | brown | green | black | orange | red |
| 4 | selfless-jealous | white | blue | purple | brown | green | yellow | black | orange | red |
| 5 | active-passive | orange | green | blue | red | black | yellow | purple | brown | white |
| 6 | like-dislike | white | black | yellow | red | orange | brown | purple | blue | green |
| 7 | clean-dirty | white | blue | black | purple | green | red | orange | yellow | brown |
| 8 | fresh-stale | green | blue | purple | red | white | yellow | black | orange | brown |
| 9 | calm-angry | green | blue | brown | white | black | yellow | purple | orange | red |
| 10 | happy-sad | white | orange | yellow | red | blue | green | black | brown | purple |
| 11 | exciting-dull | black | white | purple | orange | red | green | blue | yellow | brown |
| 12 | soft-hard | yellow | brown | blue | green | orange | purple | white | red | black |
| 13 | light-heavy | yellow | orange | blue | red | green | white | purple | brown | black |
| 14 | relaxed-tense | brown | green | orange | yellow | white | blue | red | purple | black |
| 15 | alive-dead | orange | blue | green | purple | yellow | red | brown | black | white |
| 16 | fast-slow | white | blue | purple | orange | green | red | black | yellow | brown |
# combine the original experiment and both replications into one long dataframe
df_joint = pd.concat([df_orig, df_rep, df_read]).reset_index()
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.571429 | -0.375 | -0.833333 | -1.0 | -0.25 |
42980 rows × 16 columns
# unigram frequencies from the corpus; work with log frequencies from here on
freqs = pd.read_csv('../embeddings/dedup.en.words.unigrams.tsv', sep='\t')
freqs['log_freq'] = np.log(freqs['unigram_freq'])
freqs = freqs.drop(columns='unigram_freq')
display(freqs.round(2))
| unigram | log_freq | |
|---|---|---|
| 0 | the | 17.10 |
| 1 | you | 17.06 |
| 2 | i | 17.04 |
| 3 | to | 16.78 |
| 4 | a | 16.59 |
| ... | ... | ... |
| 2397976 | tpar1 | 0.00 |
| 2397977 | giacoia | 0.00 |
| 2397978 | ourcinders | 0.00 |
| 2397979 | tourret | 0.00 |
| 2397980 | iroki | 0.00 |
2397981 rows × 2 columns
# split each 'word1-word2' dimension label into its two pole words
poles = df_joint['dimension'].str.split('-')
df_joint['word1'] = poles.str[0]
df_joint['word2'] = poles.str[1]
# attach log frequency of each pole word, then take the difference
df_joint = df_joint.merge(freqs, left_on='word1', right_on='unigram', how='left')
df_joint = df_joint.merge(freqs, left_on='word2', right_on='unigram', how='left')
df_joint['frequency'] = df_joint['log_freq_x'] - df_joint['log_freq_y']
# drop the merge helper columns
df_joint = df_joint.drop(columns=['unigram_x', 'unigram_y', 'log_freq_x', 'log_freq_y'])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 |
42980 rows × 19 columns
# word concreteness norms (Brysbaert et al., 2014)
concreteness = pd.read_csv('../embeddings/en-brysbaert-2014.tsv', sep='\t')
display(concreteness)
| word | concreteness | |
|---|---|---|
| 0 | a | 1.46 |
| 1 | aardvark | 4.68 |
| 2 | aback | 1.65 |
| 3 | abacus | 4.52 |
| 4 | abandon | 2.54 |
| ... | ... | ... |
| 37053 | zoologist | 4.30 |
| 37054 | zoology | 3.37 |
| 37055 | zoom | 3.10 |
| 37056 | zoophobia | 2.04 |
| 37057 | zucchini | 4.87 |
37058 rows × 2 columns
# attach concreteness of each pole word, then take the difference
df_joint = df_joint.merge(concreteness, left_on='word1', right_on='word', how='left')
df_joint = df_joint.merge(concreteness, left_on='word2', right_on='word', how='left')
df_joint['concreteness'] = df_joint['concreteness_x'] - df_joint['concreteness_y']
# drop the merge helper columns
df_joint = df_joint.drop(columns=[
    'word_x',
    'word_y',
    'concreteness_x',
    'concreteness_y'
])
display(df_joint.round(2))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | nonfiction | reading_motivation | reading_part_of_self | reading_efficacy | reading_recognition | reading_other_realms | word1 | word2 | frequency | concreteness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | cold | hot | -0.22 | -0.46 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ripe | unripe | 3.49 | -0.01 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | new | old | 0.12 | 0.09 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | submissive | aggressive | -2.35 | -0.82 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | selfless | jealous | -2.96 | -0.56 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | light | heavy | 1.24 | 0.84 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | relaxed | tense | -0.23 | 0.15 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | alive | dead | -0.90 | -0.93 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | fast | slow | 0.76 | 0.04 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | 1.0 | -0.57 | -0.38 | -0.83 | -1.0 | -0.25 | high | low | 1.24 | 0.12 |
42980 rows × 20 columns
# Small World of Words (SWOW) free-association responses (cue -> R1/R2/R3)
swow = pd.read_csv('../embeddings/SWOW-EN.R100.csv')
display(swow)
| Unnamed: 0 | id | participantID | age | gender | nativeLanguage | country | education | created_at | cue | R1 | R2 | R3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 29 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | although | nevertheless | yet | but |
| 1 | 2 | 30 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | deal | no | cards | shake |
| 2 | 3 | 31 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | music | notes | band | rhythm |
| 3 | 4 | 32 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | inform | tell | rat on | NaN |
| 4 | 5 | 33 | 3 | 33 | Fe | United States | Australia | NaN | 2011-08-12 02:19:38 | way | path | via | method |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1228195 | 1228196 | 1530300 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | strange | mask | weird | stranger |
| 1228196 | 1228197 | 1530290 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | sunset | sea | sky | clause |
| 1228197 | 1228198 | 1530291 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | useless | pitty | worthless | worth |
| 1228198 | 1228199 | 1530284 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | volume | loud | music | key |
| 1228199 | 1228200 | 1530288 | 132506 | 29 | Ma | Canada | Australia | 5.0 | 2018-08-10 01:56:27 | whenever | who | where | always |
1228200 rows × 13 columns
def add_swow(df, swow, colname):
    """Add a column with the difference in free-association counts for the two pole words.

    For each row, counts how often the row's color was given as a response to
    `word1` as cue, minus how often it was given as a response to `word2`;
    missing cue/response combinations count as zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'word1', 'word2', and 'color' columns.
    swow : pandas.DataFrame
        Association data with 'cue' and 'resp' columns.
    colname : str
        Name of the difference column to add.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the new column; merge helper columns are dropped.
    """
    # rename the counts Series itself rather than the DataFrame column:
    # groupby.value_counts names its result 'resp' in older pandas but
    # 'count' in pandas >= 2.0, so a column-rename of 'resp' is fragile
    counts = (
        swow.groupby('cue')['resp']
        .value_counts()
        .rename('n')
        .reset_index()
    )
    df = df.merge(counts, left_on=['word1', 'color'], right_on=['cue', 'resp'], how='left')
    df = df.merge(counts, left_on=['word2', 'color'], right_on=['cue', 'resp'], how='left')
    # absent combinations mean zero associations, not missing data
    df['n_x'] = df['n_x'].fillna(0)
    df['n_y'] = df['n_y'].fillna(0)
    df[colname] = df['n_x'] - df['n_y']
    return df.drop(columns=[
        'cue_x',
        'cue_y',
        'resp_x',
        'resp_y',
        'n_x',
        'n_y',
    ])
# keep only cues that are dimension pole words
swow = swow[swow['cue'].isin(dimensions)]
swow_NZ = swow[(swow['country'] == 'New Zealand')]  # select only NZ respondents
swow_US = swow[(swow['country'] == 'United States')]  # select only US respondents
# count only R1 (maximal discounting)
df_joint = add_swow(df_joint, swow.rename(columns={'R1': 'resp'}), 'swow_R1')
df_joint = add_swow(df_joint, swow_NZ.rename(columns={'R1': 'resp'}), 'swow_R1_NZ')  # NZ (comment was swapped with the US line below)
df_joint = add_swow(df_joint, swow_US.rename(columns={'R1': 'resp'}), 'swow_R1_US')  # US
# count R1, R2, and R3 with equal weight (minimal discounting)
swow_all = swow.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all, 'swow_all')
# NZ respondents only
swow_all_NZ = swow_NZ.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_NZ, 'swow_all_NZ')
# US respondents only
swow_all_US = swow_US.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_US, 'swow_all_US')
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | cold | hot | -0.216432 | -0.46 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | ripe | unripe | 3.485549 | -0.01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | new | old | 0.119068 | 0.09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | submissive | aggressive | -2.352148 | -0.82 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | relaxed | tense | -0.229652 | 0.15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | alive | dead | -0.904786 | -0.93 | -1.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | fast | slow | 0.763262 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | high | low | 1.237676 | 0.12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
42980 rows × 26 columns
# check how many participants gave green as a response to various cues (to use as an example in the paper)
# .count() tallies non-null entries of every remaining column per (cue, resp) group,
# so all count columns carry the same number
counts = swow_all_US.groupby(['cue', 'resp']).count().reset_index()
display(counts[counts['resp'] == 'green'])
| cue | resp | id | participantID | created_at | variable | |
|---|---|---|---|---|---|---|
| 233 | alive | green | 1 | 1 | 1 | 1 |
| 508 | clean | green | 1 | 1 | 1 | 1 |
| 1108 | exciting | green | 1 | 1 | 1 | 1 |
| 1289 | fresh | green | 1 | 1 | 1 | 1 |
| 1456 | hard | green | 1 | 1 | 1 | 1 |
| 1706 | jealous | green | 20 | 20 | 20 | 20 |
| 1984 | new | green | 1 | 1 | 1 | 1 |
| 3010 | unripe | green | 18 | 18 | 18 | 18 |
display(df_joint.sort_values('swow_all'))
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | word1 | word2 | frequency | concreteness | swow_R1 | swow_R1_NZ | swow_R1_US | swow_all | swow_all_NZ | swow_all_US | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25920 | 11452 | sighted | selfless-jealous | sighted_68676 | green | 2 | replication_2 | self | 4.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 5138 | 242 | sighted | selfless-jealous | sighted_68736 | green | 2 | replication_1 | self | NaN | NaN | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 37530 | 23062 | sighted | selfless-jealous | sighted_67653 | green | 6 | replication_2 | other | 10.0 | 4.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 25416 | 10948 | sighted | selfless-jealous | sighted_69192 | green | 7 | replication_2 | self | 9.0 | 1.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| 16956 | 2488 | sighted | selfless-jealous | sighted_68719 | green | 5 | replication_2 | self | 3.0 | 0.0 | ... | selfless | jealous | -2.955968 | -0.56 | -19.0 | 0.0 | -10.0 | -40.0 | -1.0 | -20.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 466 | 466 | blind | clean-dirty | blind_8 | white | 2 | original | self | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 14129 | 9233 | sighted | clean-dirty | sighted_68738 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 33057 | 18589 | sighted | light-heavy | sighted_68150 | white | 2 | replication_2 | other | 9.0 | 0.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
| 12790 | 7894 | sighted | clean-dirty | sighted_68946 | white | 1 | replication_1 | other | NaN | NaN | ... | clean | dirty | 0.600633 | -1.16 | 0.0 | 0.0 | 0.0 | 8.0 | 0.0 | 7.0 |
| 21537 | 7069 | sighted | light-heavy | sighted_67884 | white | 2 | replication_2 | self | 5.0 | 1.0 | ... | light | heavy | 1.240142 | 0.84 | 1.0 | 0.0 | 1.0 | 8.0 | 0.0 | 5.0 |
42980 rows × 26 columns
(It looks like there are very few responses from NZ, but a few more from the US and elsewhere.)
def get_cosine(x, vecs_dict):
    """Project the normalized word2 - word1 difference vector onto the color vector.

    With L2-normalized embeddings (loaded with normalize=True below) the dot
    product is a cosine-style association score between the semantic dimension
    axis and the color.

    BUG FIX: the fallback for missing words was the scalar 0, but
    np.dot(vector, 0) broadcasts to an *array* of zeros rather than a scalar,
    which would pollute the DataFrame column when the color word is missing.
    Use an all-zero vector instead (matching the later redefinition of this
    function in the notebook). Note: if both pole words are missing, the
    difference is the zero vector and norm() still yields NaN.
    """
    zero = np.zeros(300)  # embedding dimensionality d=300
    return np.dot(norm(vecs_dict.get(x['word2'], zero) - vecs_dict.get(x['word1'], zero)), vecs_dict.get(x['color'], zero))
# Common Crawl vectors (cc.en.300; presumably fastText — TODO confirm),
# top 200k entries, L2-normalized so dot products in get_cosine are cosines
vecs = Vectors('../embeddings/cc.en.300.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_cc'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/cc.en.300.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.058 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.127 seconds
# subtitle-corpus (subs2vec) vectors, same loading settings as above
vecs = Vectors('../embeddings/subs.en.1e6.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_subs'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/subs.en.1e6.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.389 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.135 seconds
# register-specific embeddings (COCA registers, per the notes later in this notebook);
# each cell reloads vecs/vecs_dict and adds one cosine column
# academic
vecs = Vectors('../embeddings/acad.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_acad'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
# fiction
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_fic'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
# magazines
vecs = Vectors('../embeddings/mag.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_mag'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
# spoken
vecs = Vectors('../embeddings/spok.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_spok'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
# news
vecs = Vectors('../embeddings/news.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_news'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
display(df_joint.round(2))
[INFO] loading vectors ../embeddings/acad.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.646 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.151 seconds [INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.292 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.131 seconds [INFO] loading vectors ../embeddings/mag.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.646 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.142 seconds [INFO] loading vectors ../embeddings/spok.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.418 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.136 seconds [INFO] loading vectors ../embeddings/news.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.070 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.155 seconds
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | swow_all | swow_all_NZ | swow_all_US | cosine_cc | cosine_subs | cosine_acad | cosine_fic | cosine_mag | cosine_spok | cosine_news | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | 0.02 | 0.03 | -0.03 | -0.04 | 0.07 | 0.00 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.02 | 0.09 | 0.11 | 0.05 | 0.08 | -0.17 | 0.09 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.12 | 0.04 | 0.16 | 0.07 | 0.08 | 0.06 | 0.05 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | -0.08 | -0.08 | -0.05 | -0.07 | -0.06 | -0.06 | -0.07 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.05 | -0.01 | -0.00 | -0.04 | 0.03 | 0.04 | 0.11 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.16 | -0.04 | -0.07 | -0.10 | -0.14 | -0.13 | -0.05 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.11 | -0.04 | 0.08 | -0.03 | -0.01 | 0.01 | 0.01 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | -1.0 | 0.0 | 0.0 | 0.15 | 0.06 | 0.04 | 0.07 | -0.03 | 0.07 | 0.06 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.00 | -0.02 | -0.06 | -0.01 | -0.06 | 0.01 | -0.02 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | -0.01 | 0.04 | -0.03 | -0.01 | 0.03 | 0.06 | 0.04 |
42980 rows × 33 columns
COCA embeddings, but from COCA corpora without sentences with 1st order cooccurrences (sentences with a color word and a dimension word).
def get_cosine(x, vecs_dict):
    """Project the normalized word1 - word2 difference vector onto the color vector.

    Words absent from vecs_dict fall back to an all-zero 300-d vector.
    NOTE(review): the pole order (word1 - word2) is flipped relative to the
    earlier definition of this function; downstream correlation heatmaps take
    absolute values, so the sign does not affect them.
    """
    fallback = np.zeros(300)
    pole1 = vecs_dict.get(x['word1'], fallback)
    pole2 = vecs_dict.get(x['word2'], fallback)
    color_vec = vecs_dict.get(x['color'], fallback)
    return np.dot(norm(pole1 - pole2), color_vec)
# fiction — retrained on COCA-fiction with sentences containing first-order
# color/dimension co-occurrences removed from the training corpus
vecs = Vectors('../embeddings/fic.filtered.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_filtered'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic.filtered.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 16.957 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.138 seconds
COCA embeddings, but from training corpora from which the 100 nearest neighbors of each dimension word have been removed (in an attempt to disrupt the "scaffolding" that semantic associations with the dimension words are built on).
# fiction — retrained with the 100 nearest neighbors of each dimension word
# removed from the training corpus
vecs = Vectors('../embeddings/fic.noneighbors.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# fillna(0): NaNs presumably arise when the looked-up words are missing from
# the pruned vocabulary (zero difference vector -> NaN from norm) — treat as no association
df_joint['cosine_fic_noneighbors'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic.noneighbors.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 15.297 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.130 seconds
COCA embeddings, but from training corpora from which the labels generated by at least two participants for color-semantic associations (e.g. the label snow for the combination white and cold) have been removed. (These nameability data are explored in more detail in a section at the end of this notebook.)
# fiction — retrained with the participant-generated color-association labels
# (named by 2+ participants, e.g. "snow" for white/cold) removed from the corpus
vecs = Vectors('../embeddings/fic.nonames.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# fillna(0): NaNs presumably arise for words missing from the pruned vocabulary
df_joint['cosine_fic_nonames'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic.nonames.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 14.704 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.139 seconds
# correlations among ratings and predictors, original experiment only
df_orig = df_joint[df_joint['experiment'] == 'original']
# BUG FIX: df_orig was created but df_joint was used in the correlation below,
# so this heatmap showed the full dataset instead of the original experiment
corrs = np.abs(df_orig[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# correlations among ratings and predictors, replication_2 experiment only
df_rep = df_joint[df_joint['experiment'] == 'replication_2']
# BUG FIX: df_rep was created but df_joint was used in the correlation below,
# so this heatmap showed the full dataset instead of the replication subset
corrs = np.abs(df_rep[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
# same correlation matrix, computed over the full joint dataset (all experiments)
predictor_cols = [
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]
corrs = df_joint[predictor_cols].corr().abs().round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
def standardize(Series):
    """Return the z-scored version of Series: (x - mean) / sd (sample sd, ddof=1)."""
    mu = Series.mean()
    sigma = Series.std()
    return (Series - mu) / sigma
# z-score every continuous predictor/outcome into a matching '_z' column
# (loop replaces 27 copy-pasted assignment lines; column order is preserved)
for col in [
    'art',
    'fiction',
    'nonfiction',
    'reading_motivation',
    'reading_part_of_self',
    'reading_efficacy',
    'reading_recognition',
    'reading_other_realms',
    'rating',
    'frequency',
    'concreteness',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'swow_R1',
    'swow_R1_NZ',
    'swow_R1_US',
    'cosine_cc',
    'cosine_subs',
    'cosine_acad',
    'cosine_fic',
    'cosine_mag',
    'cosine_news',
    'cosine_spok',
    'cosine_fic_filtered',
    'cosine_fic_noneighbors',
    'cosine_fic_nonames',
]:
    df_joint[f'{col}_z'] = standardize(df_joint[col])
# dummy/effect/z-code the categorical factors; pd.get_dummies is computed
# once per factor instead of once per extracted column
group_dummies = pd.get_dummies(df_joint['group'])
df_joint['blind'] = group_dummies['blind']
df_joint['sighted'] = group_dummies['sighted']
df_joint['group_eff'] = (df_joint['sighted'] - .5) * 2  # effect coding: blind -1, sighted +1
df_joint['group_z'] = standardize(df_joint['sighted'])
experiment_dummies = pd.get_dummies(df_joint['experiment'])
df_joint['original'] = experiment_dummies['original']
df_joint['replication_1'] = experiment_dummies['replication_1']
df_joint['replication_2'] = experiment_dummies['replication_2']
#df_joint['experiment_eff'] = (df_joint['replication'] - .5) * 2
#df_joint['experiment_z'] = standardize(df_joint['replication'])
self_other_dummies = pd.get_dummies(df_joint['self_vs_other'])
df_joint['other'] = self_other_dummies['other']
df_joint['self'] = self_other_dummies['self']
df_joint['self_vs_other_eff'] = (df_joint['other'] - .5) * 2  # effect coding: self -1, other +1
df_joint['self_vs_other_z'] = standardize(df_joint['other'])
# persist the full predictor table for the modeling notebooks
df_joint.to_csv('data_plus_predictors.tsv', sep='\t', index=False)
display(df_joint)
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | sighted | group_eff | group_z | original | replication_1 | replication_2 | other | self | self_vs_other_eff | self_vs_other_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | sighted | cold-hot | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 1 | 1 | sighted | ripe-unripe | sighted_1 | white | 7 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 2 | 2 | sighted | new-old | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 3 | 3 | sighted | submissive-aggressive | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| 4 | 4 | sighted | selfless-jealous | sighted_1 | white | 1 | original | self | NaN | NaN | ... | 1 | 1.0 | 0.211241 | 1 | 0 | 0 | 0 | 1 | -1.0 | -0.891882 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42975 | 28507 | sighted | light-heavy | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42976 | 28508 | sighted | relaxed-tense | sighted_68129 | red | 5 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42977 | 28509 | sighted | alive-dead | sighted_68129 | red | 6 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42978 | 28510 | sighted | fast-slow | sighted_68129 | red | 3 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
| 42979 | 28511 | sighted | high-low | sighted_68129 | red | 2 | replication_2 | other | 11.0 | 0.0 | ... | 1 | 1.0 | 0.211241 | 0 | 0 | 1 | 1 | 0 | 1.0 | 1.121199 |
42980 rows × 74 columns
def get_cosine_1word(x, vecs_dict):
    """Dot product between a single dimension-pole word and a color word.

    With L2-normalized vectors this equals their cosine similarity; a word
    absent from vecs_dict contributes an all-zero vector (score 0).
    """
    fallback = np.zeros(300)
    dimension_vec = vecs_dict.get(x['dimension'], fallback)
    color_vec = vecs_dict.get(x['color'], fallback)
    return np.dot(dimension_vec, color_vec)
# fiction
# reload COCA-fiction vectors for the single-word (pole-to-color) cosines
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)
vecs_dict = vecs.as_dict()
# nameability measures for each color-dimension prompt (one row per prompt)
df_names = pd.read_csv('color_dimension_nameability.csv')
display(df_names.head())
[INFO] loading vectors ../embeddings/fic.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 17.838 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 1.847 seconds
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat |
# check how many participants provided labels for each color-adjective pair
print(df_names['number_responses'].min())
print(df_names['number_responses'].max())
# show/save prompts sorted from least to most nameable (modal agreement)
display(df_names.sort_values('modal_agreement'))
df_names.sort_values('modal_agreement').to_csv('mean_nameability.tsv', sep='\t', index=False)
7 13
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | cosine_fic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99 | liked_blue | liked | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... | 0.076923 | sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... | 0.093598 |
| 40 | relaxed_blue | relaxed | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | smurfette,meditation,bird,water,tranquility,st... | 0.076923 | smurfette,meditation,bird,water,tranquility,st... | 0.116994 |
| 30 | submissive_blue | submissive | blue | 13 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | macaw,nun,bird,swallow,butterfly,flower,door,b... | 0.076923 | macaw,nun,bird,swallow,butterfly,flowers,door,... | 0.107565 |
| 91 | old_blue | old | blue | 13 | 1.076923 | 1.000000 | 1.000000 | 0.000000 | 0.076923 | bluecheese,necklace,bird,dress,shoe,smurfs,rug... | 0.076923 | bluecheese,necklace,bird,dress,shoes,smurfs,ru... | 0.239095 |
| 192 | clean_yellow | clean | yellow | 12 | 1.083333 | 1.000000 | 1.000000 | 0.000000 | 0.083333 | table,detergant,sun,glove,hat,flag,ford,mustan... | 0.083333 | table,detergant,sun,gloves,hat,flag,ford.musta... | 0.203213 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 151 | clean_white | clean | white | 9 | 1.111111 | 0.700000 | 0.600000 | 0.222222 | 0.555556 | sheet | 0.333333 | sheets | 0.351288 |
| 170 | ripe_yellow | ripe | yellow | 12 | 1.000000 | 0.500000 | 0.500000 | 0.318182 | 0.583333 | banana | 0.583333 | banana | 0.235731 |
| 68 | cold_blue | cold | blue | 13 | 1.000000 | 0.461538 | 0.461538 | 0.358974 | 0.615385 | ice | 0.615385 | ice | 0.319919 |
| 147 | cold_white | cold | white | 9 | 1.000000 | 0.333333 | 0.333333 | 0.583333 | 0.777778 | snow | 0.777778 | snow | 0.283547 |
| 158 | stale_white | stale | white | 9 | 1.000000 | 0.222222 | 0.222222 | 0.777778 | 0.888889 | bread | 0.888889 | bread | 0.244807 |
306 rows × 13 columns
# flatten the comma-separated modal-name lists into one flat list of labels
names = list(chain.from_iterable(name.split(',') for name in df_names['modal_names']))
names_all = set(names)  # every unique label
names_count = Counter(names)
# labels produced by at least two participants
names_2plus = [label for label, count in names_count.most_common() if count >= 2]
print(f'Number of labels named by at least 2 participants: {len(names_2plus)}')
with open('names_all.txt', 'w') as namesfile:
    namesfile.write('\n'.join(names_all))
with open('names_2plus.txt', 'w') as namesfile:
    namesfile.write('\n'.join(names_2plus))
# let's ignore words like "me", "my", and "a" though
Number of labels named by at least 2 participants: 242
Since we only have nameability for colors and dimension axis poles (i.e. for yellow and dislike but not yellow and dislike-like), we correlate nameability measures with cosine similarity between color and dimension axis pole.
# sanity check: the two nameability measures are themselves strongly correlated
pearsonr(df_names['simpson_diversity'], df_names['modal_agreement'])
(0.8947743710654127, 1.8167397467075646e-108)
# pole-to-color cosine in COCA-fiction space, correlated with both nameability measures
df_names['cosine_fic'] = df_names.apply(lambda x: get_cosine_1word(x, vecs_dict), axis=1)
display(df_names.head())
x = pearsonr(df_names['cosine_fic'], df_names['simpson_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_names['cosine_fic'], df_names['modal_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
g = sns.lmplot(x='cosine_fic', y='simpson_diversity', data=df_names)
g = sns.lmplot(x='cosine_fic', y='modal_agreement', data=df_names)
| prompt | dimension | color | number_responses | avg_words_per_response | percent_unique_words | percent_unique_lemmas | simpson_diversity | modal_agreement | modal_names | modal_response_agreement | modal_response | cosine_fic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | happy_brown | happy | brown | 10 | 1.000000 | 0.800000 | 0.800000 | 0.044444 | 0.200000 | cat,puppy | 0.200000 | cat,puppy | 0.105870 |
| 1 | unripe_brown | unripe | brown | 10 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.100000 | grape,bannana,kiwi,avocado,pear,fruit,tree,coc... | 0.322344 |
| 2 | hard_brown | hard | brown | 10 | 1.000000 | 0.900000 | 0.800000 | 0.044444 | 0.200000 | wood,rock | 0.200000 | wood | 0.211326 |
| 3 | angry_blue | angry | blue | 13 | 1.076923 | 0.714286 | 0.714286 | 0.054945 | 0.230769 | shark | 0.230769 | shark | 0.174112 |
| 4 | sad_brown | sad | brown | 10 | 1.100000 | 0.909091 | 0.909091 | 0.018182 | 0.200000 | cat | 0.200000 | cat | 0.228968 |
pearsonr(cosine_fiction, simpson_diversity): 0.214, p-value: 0.000 pearsonr(cosine_fiction, modal_agreement): 0.202, p-value: 0.000
Since we do not have human ratings for the association between colors and dimension axis poles (only for association between colors and dimension axes), we need to collapse our nameability measures for the two poles of each dimension axis. One way to do this is to compute difference scores.
# sighted participants only; .copy() so the column assignments below cannot
# write through to df_joint
df_sighted = df_joint.loc[df_joint['group'] == 'sighted'].copy()
# BUG FIX: merge() returns a result with a fresh RangeIndex while df_sighted
# keeps its non-contiguous df_joint index; assigning the merged Series
# directly aligns on those mismatched labels and silently scrambles/NaNs the
# values. Take the underlying array instead — a left merge preserves row
# order, and df_names has one row per (dimension, color) prompt, so
# positions line up one-to-one.
df_sighted['diversity_word1'] = df_sighted.merge(df_names, how='left', left_on=['word1', 'color'], right_on=['dimension', 'color'])['simpson_diversity'].to_numpy()
df_sighted['diversity_word2'] = df_sighted.merge(df_names, how='left', left_on=['word2', 'color'], right_on=['dimension', 'color'])['simpson_diversity'].to_numpy()
df_sighted['agreement_word1'] = df_sighted.merge(df_names, how='left', left_on=['word1', 'color'], right_on=['dimension', 'color'])['modal_agreement'].to_numpy()
df_sighted['agreement_word2'] = df_sighted.merge(df_names, how='left', left_on=['word2', 'color'], right_on=['dimension', 'color'])['modal_agreement'].to_numpy()
# nameability difference scores between the two poles of each dimension axis
df_sighted['diff_diversity'] = (df_sighted['diversity_word1'] - df_sighted['diversity_word2'])
df_sighted['diff_agreement'] = (df_sighted['agreement_word1'] - df_sighted['agreement_word2'])
df_sighted = df_sighted.dropna()
display(df_sighted.head())
# aggregate over participants: one mean and one SD per (color, dimension) cell;
# numeric_only=True matches the old pandas default of silently dropping
# non-numeric columns and keeps this working on newer pandas
df_mean_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).mean(numeric_only=True).reset_index()
df_sd_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).std(numeric_only=True).reset_index()
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
| index | group | dimension | pp_id | color | rating | experiment | self_vs_other | art | fiction | ... | other | self | self_vs_other_eff | self_vs_other_z | diversity_word1 | diversity_word2 | agreement_word1 | agreement_word2 | diff_diversity | diff_agreement | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14468 | 0 | sighted | cold-hot | sighted_69212 | brown | 4 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.012821 | 0.142857 | 0.285714 | -0.012821 | -0.142857 |
| 14469 | 1 | sighted | ripe-unripe | sighted_69212 | brown | 7 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.238095 | 0.035714 | 0.428571 | 0.285714 | 0.202381 | 0.142857 |
| 14470 | 2 | sighted | new-old | sighted_69212 | brown | 6 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14471 | 3 | sighted | submissive-aggressive | sighted_69212 | brown | 2 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.000000 | 0.142857 | 0.142857 | 0.000000 | 0.000000 |
| 14472 | 4 | sighted | selfless-jealous | sighted_69212 | brown | 5 | replication_2 | self | 3.0 | 0.0 | ... | 0 | 1 | -1.0 | -0.891882 | 0.000000 | 0.044444 | 0.142857 | 0.285714 | -0.044444 | -0.142857 |
5 rows × 80 columns
pearsonr(rating, simpson_diversity_difference): 0.036, p-value: 0.666 pearsonr(rating, modal_agreement_difference): -0.012, p-value: 0.890 pearsonr(cosine_fiction, simpson_diversity_difference): 0.058, p-value: 0.492 pearsonr(cosine_fiction, modal_agreement_difference): 0.059, p-value: 0.482
# per-cell SD of the ratings as a measure of participant disagreement
df_mean_sighted['rating_sd'] = df_sd_sighted['rating']
g = sns.lmplot(x='rating_sd', y='diff_diversity', data=df_mean_sighted)
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
# BUG FIX: the second correlation repeated the diversity test verbatim;
# mirroring the agreement analysis later in the notebook, it should test the
# modal-agreement difference score instead
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
One other way to work around the issue of having only color to dimension axis pole nameability is to split and invert the human ratings of color-dimension axis associations to create two scores per rating: One for the right end of the axis (equal to the rating), and one for the left end of the axis (equal to eight minus the rating). For example: If yellow is assigned a 6 on the scale dislike-like, the rating for yellow/like is 6, but we also create a rating of 2 for yellow/dislike.
# Split each axis rating into two pole-level scores: the word2 (right) pole
# keeps the rating as-is, the word1 (left) pole gets the inverted rating (8 - r).
left_pole = df_sighted[[
    'color',
    'word1',
    'rating',
    'diversity_word1',
    'agreement_word1'
]].rename(columns={
    'word1': 'dimension',
    'diversity_word1': 'simpson_diversity',
    'agreement_word1': 'modal_agreement'
})
left_pole['rating'] = 8 - left_pole['rating']
right_pole = df_sighted[[
    'color',
    'word2',
    'rating',
    'diversity_word2',
    'agreement_word2'
]].rename(columns={
    'word2': 'dimension',
    'diversity_word2': 'simpson_diversity',
    'agreement_word2': 'modal_agreement'
})
df_inverse = pd.concat([left_pole, right_pole])
display(df_inverse)
# aggregate to one mean/SD per (color, pole word) cell and correlate with nameability
df_mean_inverse = df_inverse.groupby(['color', 'dimension']).mean().reset_index()
df_sd_inverse = df_inverse.groupby(['color', 'dimension']).std().reset_index()
x = pearsonr(df_mean_inverse['rating'], df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_inverse['rating'], df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
| color | dimension | rating | simpson_diversity | modal_agreement | |
|---|---|---|---|---|---|
| 14468 | brown | cold | 4 | 0.000000 | 0.142857 |
| 14469 | brown | ripe | 1 | 0.238095 | 0.428571 |
| 14470 | brown | new | 2 | 0.000000 | 0.142857 |
| 14471 | brown | submissive | 6 | 0.000000 | 0.142857 |
| 14472 | brown | selfless | 3 | 0.000000 | 0.142857 |
| ... | ... | ... | ... | ... | ... |
| 41138 | yellow | hard | 2 | 0.000000 | 0.125000 |
| 41139 | yellow | heavy | 2 | 0.000000 | 0.125000 |
| 41140 | yellow | tense | 2 | 0.000000 | 0.125000 |
| 41141 | yellow | dead | 2 | 0.000000 | 0.125000 |
| 41142 | yellow | slow | 2 | 0.000000 | 0.125000 |
46272 rows × 5 columns
pearsonr(rating, simpson_diversity): 0.062, p-value: 0.293 pearsonr(rating, modal_agreement): 0.070, p-value: 0.237
# per-cell SD of the pole-level ratings as a disagreement measure
df_mean_inverse['rating_sd'] = df_sd_inverse['rating']
g = sns.lmplot(x='rating_sd', y='modal_agreement', data=df_mean_inverse)
g = sns.lmplot(x='rating_sd', y='simpson_diversity', data=df_mean_inverse)
x = pearsonr(df_mean_inverse['rating_sd'], df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_inverse['rating_sd'], df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.228, p-value: 0.000 pearsonr(rating_sd, modal_agreement): 0.228, p-value: 0.000
In short: nameability (measured as Simpson diversity and name agreement for the modal name) is weakly correlated with cosine similarity between colors and dimension-axis poles, but not with human ratings. This holds regardless of whether we fit the nameability to the ratings (by computing difference scores for the nameability measures) or fit the ratings to the nameability (by computing inverse ratings for the left poles of the dimension axes).
# Reload the filtered COCA-fiction embeddings and subset them to the color
# terms and dimension-pole words used in the rating task.
vecs = Vectors('../embeddings/fic.filtered.en.vec', n=1e6, d=300, normalize=True)
vecs_dict = vecs.as_dict()
color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))
# One axis vector per dimension: first pole minus second pole.
dimension_pair_vecs = np.vstack([vecs_dict[first] - vecs_dict[second]
                                 for first, second in dimension_pairs])
[INFO] loading vectors ../embeddings/fic.filtered.en.vec [INFO] <function Vectors.__init__ at 0x7fb0731c9d40> ran in 17.975 seconds [INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.153 seconds
Filtered 232340 vectors, 9 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.616 seconds
Filtered 232340 vectors, 34 remaining.
[INFO] <function filter_vecs at 0x7fb0734b87a0> ran in 0.801 seconds
# Nearest neighbors of each dimension axis in embedding space; persisted as
# TSV for downstream use.
dimension_neighbors = (
    compute_nn(vecs, dimension_pair_vecs, np.array(dimension_labels),
               num_neighbors=100, whole_matrix=True)
    .rename(columns={'target': 'dimension'})
)
display(dimension_neighbors)
dimension_neighbors.to_csv('100_neighbors_coca_fic.tsv', sep='\t', index=False)
[INFO] <function Vectors.as_dict at 0x7fb0731c9f80> ran in 0.145 seconds [INFO] computing analogies using whole matrix additive method [INFO] <function compute_nn at 0x7fb0731c9cb0> ran in 0.951 seconds
| dimension | neighbor 0 | neighbor 1 | neighbor 2 | neighbor 3 | neighbor 4 | neighbor 5 | neighbor 6 | neighbor 7 | neighbor 8 | ... | neighbor -10 | neighbor -9 | neighbor -8 | neighbor -7 | neighbor -6 | neighbor -5 | neighbor -4 | neighbor -3 | neighbor -2 | neighbor -1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | cold-hot | cold | coldness | chill | icy | chilling | icy_cold | chilled | colder | chilly | ... | burnin | Bellissimo | spics | hotting | hotspots | hottest | hotter | hotspot | Hot | hot |
| 1 | ripe-unripe | ripe | Forties | Belmont | Intimidating | riper | Corrupt | Proxy | high_maintenance | Facilities | ... | kopf | tablets | uncap | unread | hoar | unsay | unrisen | unre | uns | unripe |
| 2 | new-old | new | fresh | new_batch | changes | rethink | redefining | sequencing | newly | newest | ... | old_duffer | oldtimer | ancient | gnarled | Old | older | old_hag | old_farts | old_fart | old |
| 3 | submissive-aggressive | submissively | submissive | submission | submissions | submissiveness | Submission | Daughter_Lord | submit | submits | ... | aggressor | aggressiveness | aggro | Aggressive | aggression | unaggressive | aggressions | aggres | aggressively | aggressive |
| 4 | selfless-jealous | selfless | selflessly | selflessness | selfless_act | ethos | wolfless_men | self_regard | self_denial | selfserving | ... | snoop_around | jealous_rage | upset | jealousies | Jealous | jealous_type | jealousy | jeal | jealously | jealous |
| 5 | active-passive | active | activ | active_duty | actively | health_clubs | Glacier_Bay | Inactive | fund_raising_event | fund_raising | ... | Passive | impassively | graceless | impassivity | submissiveness | passivity | impassive | Impassive | passively | passive |
| 6 | like-dislike | like | Like | Titty_Twister | Feels_like | raggedy_ass | Stelly | horror_movie | papier_mch | B_movie | ... | broad_mindedness | antagonism | an_instant_dislike | intense_dislike | Dislike | mislike | disliking | disliked | dislikes | dislike |
| 7 | clean-dirty | clean | cleaned | cleancut | sterilize | reasonably | sterilized | cleanser | cleanses | cleans | ... | dirty_slush | muddy | scurrilous | dirty_jokes | grimy | filthy | grubby | irty | Dirty | dirty |
| 8 | fresh-stale | fresh | afresh | freshly | Fresh | newly | new | clean | freshly_cut | refresh | ... | stale_odor | stale_tobacco | stalemated | stale_cigarettes | staleness | stale_smoke | stale_air | stale_beer | stale_cigarette_smoke | stale |
| 9 | calm-angry | calm | calming | serene | calms | cool | some_semblance | Breathe_deep | soothing | calmness | ... | humiliated | enraged | disgusted | furious | unhappy | angry_hornets | angrier | angry_mob | Angry | angry |
| 10 | happy-sad | happy | happ | blissfully_happy | perfectly_happy | oblige | supremely_happy | delighted | deliriously_happy | thrilled | ... | woebegone | forlorn | sadness | sorrowful | wifeless | sadder | lifeless | mournfulness | mournful | sad |
| 11 | exciting-dull | exciting | Exciting | excited | excitingly | most_exciting | interesting | unexciting | excite | terribly_exciting | ... | dulling | dull_throbbing | duller | dull_sheen | dull_eyed | dully | dulled | dull_dull | dulls | dull |
| 12 | soft-hard | soft | soft_murmur | soft_moan | purr | rustle | mewing | throaty | whispery | silky | ... | dif_ficult | hardest | Hard | Difficult | difficult | centrate | harder_than | proving_difficult | harder | hard |
| 13 | light-heavy | light | illumination | dawnlight | lights | sunlight | wan_light | halogen_lights | illumine | illumi | ... | heavily_laden | heavy_wool | eavy | heavies | hefty | heavy_load | laden | thick | Heavy | heavy |
| 14 | relaxed-tense | relaxed | relaxer | relaxing | relax | Relaxed | eased | Contented | muscles_relaxed | contented | ... | final_confrontation | desperate_straits | Intense | Tense | nervous | accusatory | fearful | tenser | tenses | tense |
| 15 | alive-dead | alive | pleasurably | 58_9 | stay_alive | aloof | under_control | hardly_contain | marvelously | could_hardly_contain | ... | empty_beer_cans | muddy_ditch | fallen | Dead | graves | laundryman | grave | Dropped | deader | dead |
| 16 | fast-slow | fast | quickly | Quickly | Ouickly | rapidly | faster_faster | Faster_faster | soon | easily | ... | mesmerizing | deliberate | cadenced | quiescence | unremitting_courtesy | slow_drawl | languor | languorous | gentle | slow |
17 rows × 201 columns
# Drop the high-low control dimension, then locate each dimension's
# lowest- and highest-rated color.
df_viz = df_joint[df_joint['dimension'] != 'high-low']
df_means = df_viz.groupby(['dimension', 'color', 'word1', 'word2']).mean().reset_index()
# Order dimensions by rating variability, most variable first.
dim_order = df_means.groupby('dimension').std().sort_values('rating', ascending=False).reset_index()['dimension']
df_means = df_means.set_index('dimension').loc[dim_order].reset_index()
# Boolean masks picking the rows that carry each dimension's extreme mean rating.
is_min = df_means['rating'] == df_means.groupby(['dimension'])['rating'].transform('min')
mins = df_means[is_min]
is_max = df_means['rating'] == df_means.groupby(['dimension'])['rating'].transform('max')
maxs = df_means[is_max]
# Pull the raw (per-participant) ratings for those extreme color/dimension cells.
df_mins = mins[['word2', 'dimension', 'color']].merge(
    df_viz[['word2', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word2'])
df_maxs = maxs[['word1', 'dimension', 'color']].merge(
    df_viz[['word1', 'dimension', 'color', 'rating']],
    how='left', on=['dimension', 'color', 'word1'])
display(df_mins)
display(df_maxs)
| word2 | dimension | color | rating | |
|---|---|---|---|---|
| 0 | hot | cold-hot | blue | 1 |
| 1 | hot | cold-hot | blue | 1 |
| 2 | hot | cold-hot | blue | 1 |
| 3 | hot | cold-hot | blue | 1 |
| 4 | hot | cold-hot | blue | 2 |
| ... | ... | ... | ... | ... |
| 4555 | passive | active-passive | red | 2 |
| 4556 | passive | active-passive | red | 5 |
| 4557 | passive | active-passive | red | 2 |
| 4558 | passive | active-passive | red | 3 |
| 4559 | passive | active-passive | red | 3 |
4560 rows × 4 columns
| word1 | dimension | color | rating | |
|---|---|---|---|---|
| 0 | cold | cold-hot | red | 7 |
| 1 | cold | cold-hot | red | 7 |
| 2 | cold | cold-hot | red | 7 |
| 3 | cold | cold-hot | red | 6 |
| 4 | cold | cold-hot | red | 7 |
| ... | ... | ... | ... | ... |
| 4547 | active | active-passive | brown | 6 |
| 4548 | active | active-passive | brown | 5 |
| 4549 | active | active-passive | brown | 4 |
| 4550 | active | active-passive | brown | 4 |
| 4551 | active | active-passive | brown | 3 |
4552 rows × 4 columns
# Dot plot of ratings per pole word (word1 on the left axis, word2 on the
# right twin axis), with one dot per color term.
sns.set_style('darkgrid')
# Each color term maps to itself so matplotlib uses the name as the hue color.
all_colors = {color: color for color in df_viz['color']}
fig, ax1 = plt.subplots(figsize=(3, 8))
# FIX: seaborn's `ci` is expressed in percent (0-100); the original `ci=.95`
# requested a 0.95% interval, i.e. essentially invisible error bars. `ci=95`
# draws the intended 95% confidence interval.
sns.pointplot(data=df_viz, y='word1', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax1, ci=95)
ax2 = ax1.twinx()
sns.pointplot(data=df_viz, y='word2', x='rating', hue='color',
              palette=all_colors, join=False, dodge=False, ax=ax2, ci=95)
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
#plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')
#plt.savefig('figures/color_ratings.png', bbox_inches='tight')
[(1.0, 7.0), [<matplotlib.axis.XTick at 0x7fb028429a50>, <matplotlib.axis.XTick at 0x7fb031291f10>, <matplotlib.axis.XTick at 0x7faff4e9f7d0>, <matplotlib.axis.XTick at 0x7fb014579510>, <matplotlib.axis.XTick at 0x7fb014580d90>, <matplotlib.axis.XTick at 0x7fb014580d50>, <matplotlib.axis.XTick at 0x7fb014580190>]]
# Violin plot of rating distributions for each dimension's highest-rated color
# (left axis, word1) and lowest-rated color (right twin axis, word2).
sns.set_style('whitegrid')
# Color terms double as their own palette entries.
mins_colors = dict(zip(mins['color'], mins['color']))
maxs_colors = dict(zip(maxs['color'], maxs['color']))
fig, left_ax = plt.subplots(figsize=(3, 7))
sns.violinplot(data=df_maxs, y='word1', x='rating', hue='color',
               palette=maxs_colors, dodge=False, ax=left_ax, inner=None, cut=0)
right_ax = left_ax.twinx()
sns.violinplot(data=df_mins, y='word2', x='rating', hue='color',
               palette=mins_colors, dodge=False, ax=right_ax, inner=None, cut=0)
for axis in (left_ax, right_ax):
    plt.setp(axis.collections, alpha=.8)  # soften the violin fills
    axis.set(ylabel='')
    axis.get_legend().remove()
left_ax.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')
plt.savefig('figures/color_ratings.png', bbox_inches='tight')
# Scatter of mean ratings vs. embedding projection, one facet per group
# (blind / sighted), with a per-dimension regression line whose label is
# placed at alternating line ends to reduce overlap.
sns.set_style('darkgrid')
# Mean rating per group x dimension x color cell (word1/word2 are the pole labels).
df_blind = df_viz[df_viz['group'] == 'blind'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_sighted = df_viz[df_viz['group'] == 'sighted'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')
#means_colors = {row['colordim']: row['color'] for _, row in df_scatter.iterrows()}
# Each color term maps to itself so matplotlib can use the name as its hue.
means_colors = {row['color']: row['color'] for _, row in df_scatter.iterrows()}
g = sns.FacetGrid(df_scatter, hue='color', col='group', height=5, palette=means_colors)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=12)
# Blind facet: one black regression line per dimension.
for i, dimension in enumerate(df_blind['dimension'].unique()):
    # NOTE(review): seaborn documents `ci` as an int in [0, 100] or None;
    # `ci=False` behaves like 0 (no visible band) but still runs the bootstrap
    # — `ci=None` would skip it entirely. Confirm before changing.
    sns.regplot(x='cosine_fic_z', y='rating', color='black', #line_kws={'linewidth': 1.5},
                data=df_blind[df_blind['dimension'] == dimension],
                ax=g.axes[0][0], ci=False, scatter=False)
    # The regression line just drawn is the newest line artist on the axes.
    line = g.axes[0][0].lines[-1]
    y = line.get_ydata()
    x = line.get_xdata()
    if i % 2 == 0:
        #if x[0] < -.5:
        # Even-numbered dimensions: label just left of the line's start...
        x = x[0] - .05
        y = y[0]
        ha = 'right'
    else:
        # ...odd-numbered: label just right of the line's end.
        x = x[-1] + .05
        y = y[-1]
        ha = 'left'
    g.axes[0][0].annotate(dimension,
                          xy=(x,y),
                          color=line.get_color(),
                          size=8, va='center', ha=ha)
# Sighted facet: same treatment, labels alternating on a 1-in-3 cadence.
for i, dimension in enumerate(df_sighted['dimension'].unique()):
    sns.regplot(x='cosine_fic_z', y='rating', color='black', #line_kws={'linewidth': 1.5},
                data=df_sighted[df_sighted['dimension'] == dimension],
                ax=g.axes[0][1], ci=False, scatter=False)
    line = g.axes[0][1].lines[-1]
    y = line.get_ydata()
    x = line.get_xdata()
    if i % 3 == 1:
        #if x[0] < -.5:
        x = x[0] - .05
        y = y[0]
        ha = 'right'
    else:
        x = x[-1] + .05
        y = y[-1]
        ha = 'left'
    g.axes[0][1].annotate(dimension,
                          xy=(x,y),
                          color=line.get_color(),
                          size=8, va='center', ha=ha)
i += 1  # NOTE(review): no effect — `i` is not read after the loop.
g.set(xlabel='COCA-fiction embedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][1].set(ylabel='')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.5, 7.5], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_dimension.pdf', bbox_inches='tight')
plt.savefig('figures/scatter_dimension.png', bbox_inches='tight')
# Per-color scatter of mean ratings vs. embedding projection with one
# regression line per color, narrow facets (aspect .5), one facet per group.
sns.set_style('darkgrid')
# Mean rating per group x dimension x color cell (word1/word2 are pole labels).
df_blind = df_viz[df_viz['group'] == 'blind'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_sighted = df_viz[df_viz['group'] == 'sighted'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')
# Color terms double as their own palette entries.
means_colors = dict(zip(df_scatter['color'], df_scatter['color']))
g = sns.FacetGrid(df_scatter, hue='color', col='group', height=5, palette=means_colors, aspect=.5, sharex=True)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=False)
g.set(xlabel='COCA-fiction\nembedding projection', ylim=[.75, 7.25], xlim=[-2.9, 2.9])
g.axes[0][0].set(ylabel='mean participant rating', title='blind')
g.axes[0][1].set(title='sighted')
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
plt.savefig('figures/scatter_color.png', bbox_inches='tight')
# Per-color scatter with regression lines, square facets (default aspect).
# NOTE(review): this cell saves to the same figures/scatter_color.* paths as
# the previous cell, overwriting its output — confirm which variant is the
# intended final figure.
sns.set_style('darkgrid')
# Mean rating per group x dimension x color cell (word1/word2 are pole labels).
df_blind = df_viz[df_viz['group'] == 'blind'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_sighted = df_viz[df_viz['group'] == 'sighted'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')
#means_colors = {row['colordim']: row['color'] for _, row in df_scatter.iterrows()}
# Each color term maps to itself so matplotlib can use the name as its hue.
means_colors = {row['color']: row['color'] for _, row in df_scatter.iterrows()}
g = sns.FacetGrid(df_scatter, hue='color', col='group', height=5, palette=means_colors)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=False)#, linewidth=.5)
g.set(xlabel='COCA-fiction embedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.5, 7.5], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
plt.savefig('figures/scatter_color.png', bbox_inches='tight')
convert_notebook('data_prep')